Project Overview


Project Goal

The primary objective of this project is to develop a predictive model for determining the likelihood of credit card clients defaulting on their payments.


Checking the data

Load Libraries

library(readr) # Package for loading the csv file. 
library(dplyr) # Package for manipulate DATA .
library(ggplot2) # Package for visualize DATA .
library(plotly) # Package for visualizing DATA in a interactive way .
library(corrplot)  # Package for visualizing correlation matrices .
library(DescTools) # Package for descriptive statistics and visualization tools .
library(ltm)
library(vcd)
library(knitr)
library(kableExtra)
library(infotheo)
library(caret)
library(caTools)
library(randomForest)

Load Data

path_data<-"D:/project PFA (Default of Credit Card Clients Dataset) SAMER/Initial_Dataset/UCI_Credit_Card.csv"
UCI_Credit_Card<-read.csv(path_data)

Overview of the Data

head(UCI_Credit_Card)
##   ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6
## 1  1     20000   2         2        1  24     2     2    -1    -1    -2    -2
## 2  2    120000   2         2        2  26    -1     2     0     0     0     2
## 3  3     90000   2         2        2  34     0     0     0     0     0     0
## 4  4     50000   2         2        1  37     0     0     0     0     0     0
## 5  5     50000   1         2        1  57    -1     0    -1     0     0     0
## 6  6     50000   1         1        2  37     0     0     0     0     0     0
##   BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2
## 1      3913      3102       689         0         0         0        0      689
## 2      2682      1725      2682      3272      3455      3261        0     1000
## 3     29239     14027     13559     14331     14948     15549     1518     1500
## 4     46990     48233     49291     28314     28959     29547     2000     2019
## 5      8617      5670     35835     20940     19146     19131     2000    36681
## 6     64400     57069     57608     19394     19619     20024     2500     1815
##   PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 default.payment.next.month
## 1        0        0        0        0                          1
## 2     1000     1000        0     2000                          1
## 3     1000     1000     1000     5000                          0
## 4     1200     1100     1069     1000                          0
## 5    10000     9000      689      679                          0
## 6      657     1000     1000      800                          0

Check the info of our DATA

str(UCI_Credit_Card)
## 'data.frame':    30000 obs. of  25 variables:
##  $ ID                        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ LIMIT_BAL                 : num  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ SEX                       : int  2 2 2 2 1 1 1 2 2 1 ...
##  $ EDUCATION                 : int  2 2 2 2 2 1 1 2 3 3 ...
##  $ MARRIAGE                  : int  1 2 2 1 1 2 2 2 1 2 ...
##  $ AGE                       : int  24 26 34 37 57 37 29 23 28 35 ...
##  $ PAY_0                     : int  2 -1 0 0 -1 0 0 0 0 -2 ...
##  $ PAY_2                     : int  2 2 0 0 0 0 0 -1 0 -2 ...
##  $ PAY_3                     : int  -1 0 0 0 -1 0 0 -1 2 -2 ...
##  $ PAY_4                     : int  -1 0 0 0 0 0 0 0 0 -2 ...
##  $ PAY_5                     : int  -2 0 0 0 0 0 0 0 0 -1 ...
##  $ PAY_6                     : int  -2 2 0 0 0 0 0 -1 0 -1 ...
##  $ BILL_AMT1                 : num  3913 2682 29239 46990 8617 ...
##  $ BILL_AMT2                 : num  3102 1725 14027 48233 5670 ...
##  $ BILL_AMT3                 : num  689 2682 13559 49291 35835 ...
##  $ BILL_AMT4                 : num  0 3272 14331 28314 20940 ...
##  $ BILL_AMT5                 : num  0 3455 14948 28959 19146 ...
##  $ BILL_AMT6                 : num  0 3261 15549 29547 19131 ...
##  $ PAY_AMT1                  : num  0 0 1518 2000 2000 ...
##  $ PAY_AMT2                  : num  689 1000 1500 2019 36681 ...
##  $ PAY_AMT3                  : num  0 1000 1000 1200 10000 657 38000 0 432 0 ...
##  $ PAY_AMT4                  : num  0 1000 1000 1100 9000 ...
##  $ PAY_AMT5                  : num  0 0 1000 1069 689 ...
##  $ PAY_AMT6                  : num  0 2000 5000 1000 679 ...
##  $ default.payment.next.month: int  1 1 0 0 0 0 0 0 0 0 ...

We have :
- Categorical DATA set to int type we should pay attention to that.
- A gap between PAY_0 and PAY_2 we should investigate why.
- ID column is useless we should do something about it.
- Our target variable is default.payment.next.month.

Check is their any NA

colSums(is.na(UCI_Credit_Card)) #calculating in every col the NA value
##                         ID                  LIMIT_BAL 
##                          0                          0 
##                        SEX                  EDUCATION 
##                          0                          0 
##                   MARRIAGE                        AGE 
##                          0                          0 
##                      PAY_0                      PAY_2 
##                          0                          0 
##                      PAY_3                      PAY_4 
##                          0                          0 
##                      PAY_5                      PAY_6 
##                          0                          0 
##                  BILL_AMT1                  BILL_AMT2 
##                          0                          0 
##                  BILL_AMT3                  BILL_AMT4 
##                          0                          0 
##                  BILL_AMT5                  BILL_AMT6 
##                          0                          0 
##                   PAY_AMT1                   PAY_AMT2 
##                          0                          0 
##                   PAY_AMT3                   PAY_AMT4 
##                          0                          0 
##                   PAY_AMT5                   PAY_AMT6 
##                          0                          0 
## default.payment.next.month 
##                          0

We have no Missing DATA


Cleaning and Manipulating our DATA

Removing ID colomn

UCI_Credit_Card<-subset(UCI_Credit_Card,select = -ID) 

Changing the label of PAY_0 to PAY_1 AND default.payment.next.month to PaymentStatus

colnames(UCI_Credit_Card)[colnames(UCI_Credit_Card) == "PAY_0"] <- "PAY_1"
colnames(UCI_Credit_Card)[colnames(UCI_Credit_Card) == "default.payment.next.month"] <- "Payment_Status"

Creating a list that have column names of Quantitative variables and another one for Catogorical variables

col_names_Quanti<-c("LIMIT_BAL","AGE",
                      "BILL_AMT1","BILL_AMT2","BILL_AMT3","BILL_AMT4","BILL_AMT5","BILL_AMT6",
                      "PAY_AMT1","PAY_AMT2","PAY_AMT3","PAY_AMT4","PAY_AMT5","PAY_AMT6")
col_names_Cato<-c("SEX","EDUCATION","MARRIAGE",
                     "PAY_1","PAY_2","PAY_3","PAY_4","PAY_5","PAY_6",
                     "Payment_Status")
all_col_names<-c(col_names_Quanti,col_names_Cato)

Changing Catogorical variables type to Factors

for (nc in col_names_Cato) {
  UCI_Credit_Card[[nc]]<-as.factor(UCI_Credit_Card[[nc]])      
}

Univariate Analysis:

Qantitative DATA

summary(UCI_Credit_Card[col_names_Quanti])
##    LIMIT_BAL            AGE          BILL_AMT1         BILL_AMT2     
##  Min.   :  10000   Min.   :21.00   Min.   :-165580   Min.   :-69777  
##  1st Qu.:  50000   1st Qu.:28.00   1st Qu.:   3559   1st Qu.:  2985  
##  Median : 140000   Median :34.00   Median :  22382   Median : 21200  
##  Mean   : 167484   Mean   :35.49   Mean   :  51223   Mean   : 49179  
##  3rd Qu.: 240000   3rd Qu.:41.00   3rd Qu.:  67091   3rd Qu.: 64006  
##  Max.   :1000000   Max.   :79.00   Max.   : 964511   Max.   :983931  
##    BILL_AMT3         BILL_AMT4         BILL_AMT5        BILL_AMT6      
##  Min.   :-157264   Min.   :-170000   Min.   :-81334   Min.   :-339603  
##  1st Qu.:   2666   1st Qu.:   2327   1st Qu.:  1763   1st Qu.:   1256  
##  Median :  20089   Median :  19052   Median : 18105   Median :  17071  
##  Mean   :  47013   Mean   :  43263   Mean   : 40311   Mean   :  38872  
##  3rd Qu.:  60165   3rd Qu.:  54506   3rd Qu.: 50191   3rd Qu.:  49198  
##  Max.   :1664089   Max.   : 891586   Max.   :927171   Max.   : 961664  
##     PAY_AMT1         PAY_AMT2          PAY_AMT3         PAY_AMT4     
##  Min.   :     0   Min.   :      0   Min.   :     0   Min.   :     0  
##  1st Qu.:  1000   1st Qu.:    833   1st Qu.:   390   1st Qu.:   296  
##  Median :  2100   Median :   2009   Median :  1800   Median :  1500  
##  Mean   :  5664   Mean   :   5921   Mean   :  5226   Mean   :  4826  
##  3rd Qu.:  5006   3rd Qu.:   5000   3rd Qu.:  4505   3rd Qu.:  4013  
##  Max.   :873552   Max.   :1684259   Max.   :896040   Max.   :621000  
##     PAY_AMT5           PAY_AMT6       
##  Min.   :     0.0   Min.   :     0.0  
##  1st Qu.:   252.5   1st Qu.:   117.8  
##  Median :  1500.0   Median :  1500.0  
##  Mean   :  4799.4   Mean   :  5215.5  
##  3rd Qu.:  4031.5   3rd Qu.:  4000.0  
##  Max.   :426529.0   Max.   :528666.0

We have:
- Negative BILL_AMT ? We should investigate that .

Analysing LIMIT_BAL using histogram

plot_ly(x = UCI_Credit_Card$LIMIT_BAL, type = "histogram")%>%layout(title = "Histogram of LIMIT_BAL",
                                                                    xaxis=list(title ="LIMIT_BAL"),
                                                                    yaxis=list(title ="Frequency"))

50k is the most frequent LIMIT_BAL

Checking LIMIT_BAL Distribution unsing Q-Q plot

ggplot(data = UCI_Credit_Card, aes(sample = LIMIT_BAL)) +
  stat_qq() +
  stat_qq_line(col = "red") +
  labs(title = "Q-Q Plot for LIMIT_BAL")

Asymmetrical Distribution (positively skewed)

Analysing AGE using histogram

plot_ly(data = UCI_Credit_Card, x = ~AGE, type = "histogram") %>%
  layout(title = "Kernel Density Plot for Age",
         xaxis = list(title = "Age"),
         yaxis = list(title = "Frequence"))

The most frequent AGE is 29.

Checking AGEL Distribution unsing Q-Q plot

ggplot(data = UCI_Credit_Card, aes(sample = AGE)) +
  stat_qq() +
  stat_qq_line(col = "red") +
  labs(title = "Q-Q Plot for AGE")

Asymmetrical Distribution (positively skewed).

Analysing BILL_AMT(i) using histogram

BILL_AMT_LIST<-list("BILL_AMT1","BILL_AMT2","BILL_AMT3","BILL_AMT4","BILL_AMT5","BILL_AMT6")
histogram_list <- lapply(BILL_AMT_LIST, function(col_name) {
    plot_ly(UCI_Credit_Card, x = ~get(col_name), type = "histogram", histnorm = "probability density",nbinsx = 80,name = col_name) %>%
      layout(title = "Histograms of Bill Amounts")
  })
  combined_plot <- subplot(histogram_list, nrows = 3)
  combined_plot

We can’t see much from those plots

Using Q_Q plot to check the distrubution

1.BILL_AMT1

ggplot(data = UCI_Credit_Card, aes(sample = BILL_AMT1)) +
  stat_qq() +
  stat_qq_line(col = "red") +
  labs(title = "Q-Q Plot for BILL_AMT1")

2.BILL_AMT2

ggplot(data = UCI_Credit_Card, aes(sample = BILL_AMT2)) +
  stat_qq() +
  stat_qq_line(col = "red") +
  labs(title = "Q-Q Plot for BILL_AMT2")

3.BILL_AMT3

ggplot(data = UCI_Credit_Card, aes(sample = BILL_AMT3)) +
  stat_qq() +
  stat_qq_line(col = "red") +
  labs(title = "Q-Q Plot for BILL_AMT3")

4.BILL_AMT4

ggplot(data = UCI_Credit_Card, aes(sample = BILL_AMT4)) +
  stat_qq() +
  stat_qq_line(col = "red") +
  labs(title = "Q-Q Plot for BILL_AMT4")

5.BILL_AMT5

ggplot(data = UCI_Credit_Card, aes(sample = BILL_AMT5)) +
  stat_qq() +
  stat_qq_line(col = "red") +
  labs(title = "Q-Q Plot for BILL_AMT5")

6.BILL_AMT6

ggplot(data = UCI_Credit_Card, aes(sample = BILL_AMT6)) +
  stat_qq() +
  stat_qq_line(col = "red") +
  labs(title = "Q-Q Plot for BILL_AMT6")

The distribution of variables can influence both the correlation between variables and the choice of model used for analysis.

BILL_AMT(i) :Amount of bill statement in month (i), 2005 (NT dollar) .So ,what does negative value mean

negative_BILL_AMT<-UCI_Credit_Card %>%
  filter(BILL_AMT1 < 0 | BILL_AMT2 < 0 | BILL_AMT3 < 0 | BILL_AMT4 < 0 | BILL_AMT5 < 0 | BILL_AMT6 < 0)
head(negative_BILL_AMT)
##   LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_1 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6
## 1    100000   2         2        2  23     0    -1    -1     0     0    -1
## 2     60000   1         1        2  27     1    -2    -1    -1    -1    -1
## 3    160000   1         1        2  30    -1    -1    -2    -2    -2    -1
## 4     60000   2         2        2  22     0     0     0     0     0    -1
## 5    180000   2         3        1  34     0     0     0    -1    -1    -1
## 6    130000   2         3        2  29     1    -2    -2    -1     2    -1
##   BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2
## 1     11876       380       601       221      -159       567      380      601
## 2      -109      -425       259       -57       127      -189        0     1000
## 3     30265      -131      -527      -923     -1488     -1884      131      396
## 4     15054      9806     11068      6026    -28335     18660     1500     1518
## 5     16386     15793      8441      7142      -679      8321     8500     1500
## 6      -190     -9850     -9850     10311     10161      7319        0        0
##   PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 Payment_Status
## 1        0      581     1687     1542              0
## 2        0      500        0     1000              1
## 3      396      565      792        0              0
## 4     2043        0    47671      617              0
## 5     7500      679     9000     2000              0
## 6    20161        0     7319    13899              0

Hypothesis: It may imply that the bank owes them money, as they are making payments exceeding what is expected.

Using histogram visualisation for “PAY_AMT(i)”

PAY_AMT_list<-c("PAY_AMT1","PAY_AMT2","PAY_AMT3","PAY_AMT4","PAY_AMT5","PAY_AMT6")

histogram_list <- lapply(PAY_AMT_list, function(col_name) {
    plot_ly(UCI_Credit_Card, x = ~get(col_name), type = "histogram",nbinsx = 70 ,name = col_name) %>%
      layout(title ="Histograms of Pay Amounts")
  })
  combined_plot <- subplot(histogram_list, nrows = 3)
  combined_plot

The distribution of PAY_AMT is heavily skewed and violates the assumption of normality. > We can consider applying a log transformation to the data to make the distribution more symmetric.

# Define a small constant to add to the variable
epsilon <- 1e-10


histogram_list <- lapply(PAY_AMT_list, function(col_name) {
    plot_ly(UCI_Credit_Card, x = ~log(get(col_name) + epsilon), type = "histogram", nbinsx = 70 , name = col_name) %>%
      layout(title = "Histograms of Pay Amounts")
})


combined_plot <- subplot(histogram_list, nrows = 3)
combined_plot

Catagorical DATA

summary(UCI_Credit_Card[col_names_Cato])
##  SEX       EDUCATION MARRIAGE      PAY_1           PAY_2           PAY_3      
##  1:11888   0:   14   0:   54   0      :14737   0      :15730   0      :15764  
##  2:18112   1:10585   1:13659   -1     : 5686   -1     : 6050   -1     : 5938  
##            2:14030   2:15964   1      : 3688   2      : 3927   -2     : 4085  
##            3: 4917   3:  323   -2     : 2759   -2     : 3782   2      : 3819  
##            4:  123             2      : 2667   3      :  326   3      :  240  
##            5:  280             3      :  322   4      :   99   4      :   76  
##            6:   51             (Other):  141   (Other):   86   (Other):   78  
##      PAY_4           PAY_5           PAY_6       Payment_Status
##  0      :16455   0      :16947   0      :16286   0:23364       
##  -1     : 5687   -1     : 5539   -1     : 5740   1: 6636       
##  -2     : 4348   -2     : 4546   -2     : 4895                 
##  2      : 3159   2      : 2626   2      : 2766                 
##  3      :  180   3      :  178   3      :  184                 
##  4      :   69   4      :   84   4      :   49                 
##  (Other):  102   (Other):   80   (Other):   80

Using Pie Chart for SEX

SEX_counts <- table(UCI_Credit_Card$SEX)

plot_ly(labels = c("Male", "Female"), values = SEX_counts, type = 'pie') %>%
  layout(annotations = list(
      x = 0,
      y = 1,
      xref = "paper",
      yref = "paper",
      text = "Pie Chart for SEX",
      font = list(color = "black", size = 20),
      showarrow = FALSE),piecolorway = c( "pink","blue"))

We have more than half female

Using Pie Chart for EDUCATION

EDUCATION_counts <- table(UCI_Credit_Card$EDUCATION)
EDUCATION_counts
## 
##     0     1     2     3     4     5     6 
##    14 10585 14030  4917   123   280    51
plot_ly(labels = list("unlabel","graduate school","university", "high school", "others", "unknown", "unknown"), values = EDUCATION_counts, type = 'pie') %>%
  layout(annotations = list(
      x = 0,
      y = 1,
      xref = "paper",
      yref = "paper",
      text = "Pie Chart for EDUCATION",
      font = list(color = "black", size = 15),
      showarrow = FALSE))

Using Pie Chart for MARRIAGE

MARRIAGE_counts <- table(UCI_Credit_Card$MARRIAGE)
MARRIAGE_counts
## 
##     0     1     2     3 
##    54 13659 15964   323
plot_ly(labels = c("unlabel","married", "single", "others"), values = MARRIAGE_counts, type = 'pie') %>%
  layout(annotations = list(
      x = 0,
      y = 1,
      xref = "paper",
      yref = "paper",
      text = "Pie Chart for MARRIAGE",
      font = list(color = "black", size = 15),
      showarrow = FALSE))

Using barplot for “PAY_0”

labels_PAY <- c("-2" = "unlabel(-2)", "-1" = "pay duly", "0" = "unlabel(0)",
                  "1" = "pay_del_1_mth", "2" = "pay_del_2_mths",
                  "3" = "pay_del_3_mths", "4" = "pay_del_4_mths",
                  "5" = "pay_del_5_mths", "6" = "pay_del_6_mths",
                  "7" = "pay_del_7_mths", "8" = "pay_del_8_mths",
                  "9" = "pay_del_9_mths_above")
ggplotly(
ggplot(UCI_Credit_Card, aes(x = PAY_1, fill = PAY_1)) +
  geom_bar() +
  labs(title = "Bar Plot by PAY_1", y = "Frequency") +
  theme_classic() +
  scale_x_discrete(labels = labels_PAY)+theme(axis.text.x = element_text(angle = 45, hjust = 1))
)

unlabel(0) is the most frequent category

Using barplot for “PAY_2”

ggplotly(
ggplot(UCI_Credit_Card, aes(x = PAY_2, fill = PAY_2)) +
  geom_bar() +
  labs(title = "Bar Plot by PAY_2", y = "Frequency") +
  theme_classic() +
  scale_x_discrete(labels = labels_PAY)+theme(axis.text.x = element_text(angle = 45, hjust = 1))
)

unlabel(0) is the most frequent category

Using barplot for “PAY_3”

ggplotly(
ggplot(UCI_Credit_Card, aes(x = PAY_3, fill = PAY_3)) +
  geom_bar() +
  labs(title = "Bar Plot by PAY_3", y = "Frequency") +
  theme_classic() +
  scale_x_discrete(labels = labels_PAY)+theme(axis.text.x = element_text(angle = 45, hjust = 1))
)

unlabel(0) is the most frequent category

Using barplot for “PAY_4”

ggplotly(
ggplot(UCI_Credit_Card, aes(x = PAY_4, fill = PAY_4)) +
  geom_bar() +
  labs(title = "Bar Plot by PAY_4", y = "Frequency") +
  theme_classic() +
  scale_x_discrete(labels = labels_PAY)+theme(axis.text.x = element_text(angle = 45, hjust = 1))
)

unlabel(0) is the most frequent category

Using barplot for “PAY_5”

ggplotly(
ggplot(UCI_Credit_Card, aes(x = PAY_5, fill = PAY_5)) +
  geom_bar() +
  labs(title = "Bar Plot by PAY_5", y = "Frequency") +
  theme_classic() +
  scale_x_discrete(labels = labels_PAY)+theme(axis.text.x = element_text(angle = 45, hjust = 1))
)

unlabel(0) is the most frequent category

Using barplot for “PAY_6”

ggplotly(
ggplot(UCI_Credit_Card, aes(x = PAY_6, fill = PAY_6)) +
  geom_bar() +
  labs(title = "Bar Plot by PAY_6", y = "Frequency") +
  theme_classic() +
  scale_x_discrete(labels = labels_PAY)+theme(axis.text.x = element_text(angle = 45, hjust = 1))
)

unlabel(0) is the most frequent category

we should investigate unlabel(0)

Using Charpie for Payment_Status :

PayStatue_counts <- table(UCI_Credit_Card$Payment_Status)

plot_ly(labels = c("Non_default", "Default"), values = PayStatue_counts, type = 'pie') %>%
  layout(annotations = list(
      x = 0,
      y = 1,
      xref = "paper",
      yref = "paper",
      text = "Pie Chart for Payment_Status",
      font = list(color = "black", size = 14),
      showarrow = FALSE),piecolorway = c("lightgreen", "lightcoral"))

Nearly 80 % did not default

Correlation Test :

Quantitave data and Payment Status Using Point-Biserial Correlation Coefficient :

correlation_results <- lapply(col_names_Quanti, function(var) {
  biserial.cor(UCI_Credit_Card[[var]], UCI_Credit_Card$Payment_Status)
})

correlation_df <- data.frame(
  Variable = col_names_Quanti,
  Correlation = unlist(correlation_results)
)


knitr::kable(correlation_df, caption = "<h2 style='color: black;'>Correlation between Payment Status and Quantitative Variables</h2>") %>%kable_styling(bootstrap_options = c("striped", "hover"))

Correlation between Payment Status and Quantitative Variables

Variable Correlation
LIMIT_BAL 0.1535199
AGE -0.0138898
BILL_AMT1 0.0196442
BILL_AMT2 0.0141932
BILL_AMT3 0.0140755
BILL_AMT4 0.0101565
BILL_AMT5 0.0067605
BILL_AMT6 0.0053723
PAY_AMT1 0.0729295
PAY_AMT2 0.0585787
PAY_AMT3 0.0562504
PAY_AMT4 0.0568274
PAY_AMT5 0.0551235
PAY_AMT6 0.0531833
rank_biserial_cor <- function(x, y) {
  cor(x, as.numeric(y), method = "spearman")
}

correlation_results <- lapply(col_names_Quanti, function(var) {
  rank_biserial_cor(UCI_Credit_Card[[var]], UCI_Credit_Card$Payment_Status)
})

correlation_df <- data.frame(
  Variable = col_names_Quanti,
  Correlation = unlist(correlation_results)
)


kable(correlation_df, caption = "<h2 style='color: black;'>Correlation between Payment Status and Quantitative Variables (Rank-Biserial)</h2>") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Correlation between Payment Status and Quantitative Variables (Rank-Biserial)

Variable Correlation
LIMIT_BAL -0.1695863
AGE 0.0051489
BILL_AMT1 -0.0253268
BILL_AMT2 -0.0155538
BILL_AMT3 -0.0126699
BILL_AMT4 -0.0083571
BILL_AMT5 -0.0068512
BILL_AMT6 -0.0000761
PAY_AMT1 -0.1604931
PAY_AMT2 -0.1509774
PAY_AMT3 -0.1393880
PAY_AMT4 -0.1279786
PAY_AMT5 -0.1165871
PAY_AMT6 -0.1214436

Catagorical data and Payment Status Using CramérV :

cramer_v_results <- list()

for (var in col_names_Cato) {
  
  contingency_table <- table(UCI_Credit_Card[[var]], UCI_Credit_Card$Payment_Status)
  
  cramers_v <- assocstats(contingency_table)$cramer
  
  
  cramer_v_results[[var]] <- cramers_v
}

cramer_df <- data.frame(
  Cramer_V = unlist(cramer_v_results)
)

knitr::kable(cramer_df, caption = "<h2 style='color: black;'>Cramer's V for Categorical Variables and Payment Status</h2>") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Cramer’s V for Categorical Variables and Payment Status

Cramer_V
SEX 0.0399606
EDUCATION 0.0737601
MARRIAGE 0.0344782
PAY_1 0.4229249
PAY_2 0.3403169
PAY_3 0.2956610
PAY_4 0.2793725
PAY_5 0.2706594
PAY_6 0.2507878
Payment_Status 1.0000000

Mutual Information Test :

mutual_info <- function(x, y) {
  mutinformation(x, y)
}

results_df <- data.frame(
  Variable_1 = character(), 
  Variable_2 = character(),
  Mutual_Information = numeric() 
)


for (i in 1:(length(col_names_Quanti)-1)) {
  for (j in (i + 1):length(col_names_Quanti)) {
    mi <- mutual_info(UCI_Credit_Card[[col_names_Quanti[i]]], UCI_Credit_Card[[col_names_Quanti[j]]])
    results_df <- rbind(results_df, list(col_names_Quanti[i], col_names_Quanti[j], mi))
  }
}
colnames(results_df)<-c("Variable_1","Variable_2","Mutual_Information")

kable(results_df, caption = "<h2 style='color: black;'>Mutual Information between Independent Quantitative Variables <h2>") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Mutual Information between Independent Quantitative Variables

Variable_1 Variable_2 Mutual_Information
LIMIT_BAL AGE 0.1399469
LIMIT_BAL BILL_AMT1 3.0363695
LIMIT_BAL BILL_AMT2 2.9865141
LIMIT_BAL BILL_AMT3 2.9442229
LIMIT_BAL BILL_AMT4 2.9001399
LIMIT_BAL BILL_AMT5 2.8511692
LIMIT_BAL BILL_AMT6 2.8070368
LIMIT_BAL PAY_AMT1 1.5899128
LIMIT_BAL PAY_AMT2 1.5645590
LIMIT_BAL PAY_AMT3 1.4919496
LIMIT_BAL PAY_AMT4 1.4233663
LIMIT_BAL PAY_AMT5 1.4051677
LIMIT_BAL PAY_AMT6 1.4030709
AGE BILL_AMT1 2.9701020
AGE BILL_AMT2 2.9188466
AGE BILL_AMT3 2.8770316
AGE BILL_AMT4 2.8293967
AGE BILL_AMT5 2.7739410
AGE BILL_AMT6 2.7191910
AGE PAY_AMT1 1.4092346
AGE PAY_AMT2 1.3983416
AGE PAY_AMT3 1.3320564
AGE PAY_AMT4 1.2503459
AGE PAY_AMT5 1.2276530
AGE PAY_AMT6 1.2182976
BILL_AMT1 BILL_AMT2 8.8891250
BILL_AMT1 BILL_AMT3 8.7431332
BILL_AMT1 BILL_AMT4 8.6173751
BILL_AMT1 BILL_AMT5 8.4869709
BILL_AMT1 BILL_AMT6 8.3257439
BILL_AMT1 PAY_AMT1 6.1601206
BILL_AMT1 PAY_AMT2 6.1112345
BILL_AMT1 PAY_AMT3 5.8700636
BILL_AMT1 PAY_AMT4 5.6392806
BILL_AMT1 PAY_AMT5 5.5318309
BILL_AMT1 PAY_AMT6 5.4650628
BILL_AMT2 BILL_AMT3 8.7578000
BILL_AMT2 BILL_AMT4 8.6087204
BILL_AMT2 BILL_AMT5 8.4634108
BILL_AMT2 BILL_AMT6 8.2931153
BILL_AMT2 PAY_AMT1 6.3681981
BILL_AMT2 PAY_AMT2 6.1055936
BILL_AMT2 PAY_AMT3 5.8603709
BILL_AMT2 PAY_AMT4 5.6158353
BILL_AMT2 PAY_AMT5 5.5088153
BILL_AMT2 PAY_AMT6 5.4238519
BILL_AMT3 BILL_AMT4 8.6465863
BILL_AMT3 BILL_AMT5 8.4798836
BILL_AMT3 BILL_AMT6 8.2976876
BILL_AMT3 PAY_AMT1 6.0595569
BILL_AMT3 PAY_AMT2 6.3305476
BILL_AMT3 PAY_AMT3 5.8690754
BILL_AMT3 PAY_AMT4 5.6231800
BILL_AMT3 PAY_AMT5 5.5047360
BILL_AMT3 PAY_AMT6 5.4247965
BILL_AMT4 BILL_AMT5 8.4934053
BILL_AMT4 BILL_AMT6 8.2970711
BILL_AMT4 PAY_AMT1 5.9425831
BILL_AMT4 PAY_AMT2 6.0165910
BILL_AMT4 PAY_AMT3 6.0890847
BILL_AMT4 PAY_AMT4 5.6067962
BILL_AMT4 PAY_AMT5 5.5010147
BILL_AMT4 PAY_AMT6 5.4042946
BILL_AMT5 BILL_AMT6 8.3267993
BILL_AMT5 PAY_AMT1 5.8317209
BILL_AMT5 PAY_AMT2 5.8873452
BILL_AMT5 PAY_AMT3 5.7546921
BILL_AMT5 PAY_AMT4 5.8501962
BILL_AMT5 PAY_AMT5 5.4999604
BILL_AMT5 PAY_AMT6 5.4063811
BILL_AMT6 PAY_AMT1 5.7026072
BILL_AMT6 PAY_AMT2 5.7563161
BILL_AMT6 PAY_AMT3 5.6205288
BILL_AMT6 PAY_AMT4 5.4985666
BILL_AMT6 PAY_AMT5 5.7709690
BILL_AMT6 PAY_AMT6 5.4245663
PAY_AMT1 PAY_AMT2 4.2847497
PAY_AMT1 PAY_AMT3 4.0969641
PAY_AMT1 PAY_AMT4 3.8755556
PAY_AMT1 PAY_AMT5 3.7669638
PAY_AMT1 PAY_AMT6 3.7048184
PAY_AMT2 PAY_AMT3 4.1516475
PAY_AMT2 PAY_AMT4 3.9520788
PAY_AMT2 PAY_AMT5 3.8241769
PAY_AMT2 PAY_AMT6 3.7619519
PAY_AMT3 PAY_AMT4 3.9099853
PAY_AMT3 PAY_AMT5 3.8129627
PAY_AMT3 PAY_AMT6 3.7079715
PAY_AMT4 PAY_AMT5 3.8003864
PAY_AMT4 PAY_AMT6 3.7170818
PAY_AMT5 PAY_AMT6 3.7547790
mutual_info <- function(x, y) {
  mutinformation(x, y)
}

results_df <- data.frame(
  Feature_1 = character(),
  Feature_2 = character(),
  Mutual_Information = numeric() 
)
new<-col_names_Cato[-10]


for (i in 1:(length(new)-1)) {
  for (j in (i + 1):length(new)) {
    mi <- mutual_info(UCI_Credit_Card[[new[i]]], UCI_Credit_Card[[new[j]]])
    results_df <- rbind(results_df, list(new[i], new[j], mi))
  }
}
colnames(results_df)<-c("Variable_1","Variable_2","Mutual_Information")

kable(results_df, caption = "<h2 style='color: black;'>Mutual Information between Independent Catagorical Variables<h2>") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Mutual Information between Independent Catagorical Variables

Variable_1 Variable_2 Mutual_Information
SEX EDUCATION 0.0005135
SEX MARRIAGE 0.0005546
SEX PAY_1 0.0023480
SEX PAY_2 0.0029846
SEX PAY_3 0.0027330
SEX PAY_4 0.0022314
SEX PAY_5 0.0017651
SEX PAY_6 0.0012304
EDUCATION MARRIAGE 0.0190177
EDUCATION PAY_1 0.0207493
EDUCATION PAY_2 0.0241148
EDUCATION PAY_3 0.0226826
EDUCATION PAY_4 0.0192763
EDUCATION PAY_5 0.0163640
EDUCATION PAY_6 0.0153678
MARRIAGE PAY_1 0.0022219
MARRIAGE PAY_2 0.0021541
MARRIAGE PAY_3 0.0019783
MARRIAGE PAY_4 0.0023897
MARRIAGE PAY_5 0.0021056
MARRIAGE PAY_6 0.0018328
PAY_1 PAY_2 0.8195850
PAY_1 PAY_3 0.4918402
PAY_1 PAY_4 0.3878989
PAY_1 PAY_5 0.3267540
PAY_1 PAY_6 0.2950993
PAY_2 PAY_3 0.6742030
PAY_2 PAY_4 0.4676819
PAY_2 PAY_5 0.3843708
PAY_2 PAY_6 0.3420818
PAY_3 PAY_4 0.6464694
PAY_3 PAY_5 0.4587498
PAY_3 PAY_6 0.3954185
PAY_4 PAY_5 0.6438151
PAY_4 PAY_6 0.4647349
PAY_5 PAY_6 0.6356835

Applying Logistic regression :

set.seed(420) 
train_index <- sample.split(UCI_Credit_Card$Payment_Status, SplitRatio = 0.7)
train_data <- UCI_Credit_Card[train_index, ]
test_data <- UCI_Credit_Card[-train_index, ]


model <- glm(Payment_Status ~ ., data = train_data, family = binomial)



predictions <- predict(model, newdata = test_data, type = "response")


predicted_classes <- ifelse(predictions > 0.5, 1, 0)




# Create a confusion matrix
confusion_matrix <- confusionMatrix(data = factor(test_data$Payment_Status),
                                    reference = factor(predicted_classes))

# Print the confusion matrix
print(confusion_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 22284  1080
##          1  4284  2351
##                                           
##                Accuracy : 0.8212          
##                  95% CI : (0.8168, 0.8255)
##     No Information Rate : 0.8856          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3725          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8388          
##             Specificity : 0.6852          
##          Pos Pred Value : 0.9538          
##          Neg Pred Value : 0.3543          
##              Prevalence : 0.8856          
##          Detection Rate : 0.7428          
##    Detection Prevalence : 0.7788          
##       Balanced Accuracy : 0.7620          
##                                           
##        'Positive' Class : 0               
## 

Applying Random Forest :

rf_model <- randomForest(Payment_Status ~ ., data = train_data)

rf_predictions <- predict(rf_model, newdata = test_data)


rf_confusion_matrix <- confusionMatrix(data = factor(test_data$Payment_Status),
                                       reference = factor(rf_predictions))


print(rf_confusion_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 23011   353
##          1  1401  5234
##                                           
##                Accuracy : 0.9415          
##                  95% CI : (0.9388, 0.9442)
##     No Information Rate : 0.8138          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8201          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9426          
##             Specificity : 0.9368          
##          Pos Pred Value : 0.9849          
##          Neg Pred Value : 0.7888          
##              Prevalence : 0.8138          
##          Detection Rate : 0.7671          
##    Detection Prevalence : 0.7788          
##       Balanced Accuracy : 0.9397          
##                                           
##        'Positive' Class : 0               
##